#message=FALSE means some output messages aren't printed in the knitted HTML document
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
#message=FALSE and warning=FALSE ensure any error or output messages aren't shown in the output HTML file
#echo=TRUE ensures that the code used to produce outputs can be accessed through the output file
#loading any necessary packages
library(dplyr)
library(tidyverse)
library(knitr)
library(readr)
library(ggplot2)
library(bbmle)
library(mizer)
library(Hmisc)
library(lme4)
library(RColorBrewer)
library(viridis)
library(hexbin)
library(ggpattern)
library(sjPlot)
load("stomach_dataset.Rdata")
#loading the dataset to be used in analysis (the dataset is already named stom_df)
df <- stom_df %>%
transmute(haul_id, ices_rectangle, year, pred_species,
pred_weight_g, prey_type = prey_funcgrp,
total_prey_weight = prey_weight_g,
prey_count, n_stomachs,
no._prey_per_stmch = prey_count/n_stomachs,
indiv_prey_weight = prey_ind_weight_g,
ppmr)
#creating a new data frame called 'df' which only contains certain selected columns
df <- df[df$indiv_prey_weight != 0, ]
df <- df[df$pred_weight_g != 0, ]
df <- df[df$indiv_prey_weight != Inf, ]
#Removes any points where the prey weight = Inf or 0, or the predator weight = 0
df<- df[!(is.na(df$year)),]
df<- df[!(is.na(df$ices_rectangle)),]
#Removes any points where the value is NA
df$lppmr <- log(df$ppmr)
df$lprey_weight <- log(df$indiv_prey_weight)
df$lpred_weight <- log(df$pred_weight_g)
#Adds columns which take the log value of PPMR, individual prey weight and predator weight to the main data set
renamed_df = df %>%
mutate(pred_species = replace(pred_species,
pred_species == "Micromesistius poutassou", "Blue Whiting")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Gadus morhua", "Cod")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Limanda limanda", "Common Dab")) %>%
mutate(pred_species = replace(pred_species,
pred_species == "Merluccius merluccius", "European Hake")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Melanogrammus aeglefinus", "Haddock")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Clupea harengus", "Herring")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Trachurus trachurus", "Horse Mackerel")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Scomber scombrus", "Mackerel")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Lepidorhombus whiffiagonis", "Megrim")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Lophius piscatorius", "Monkfish")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Trisopterus esmarkii", "Norway Pout")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Pleuronectes platessa", "Plaice")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Trisopterus minutus", "Poor Cod")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Solea solea", "Sole")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Sprattus sprattus", "Sprat")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Merlangius merlangus", "Whiting"))
#Renaming the predator species from their latin names (e.g. Gadus morhua) to their more common names (e.g. Cod)
#Creates a new data frame (called 'renamed_df') with these replaced names
species_list <- c("Blue Whiting", "Cod", "Common Dab", "European Hake", "Haddock", "Herring",
"Horse Mackerel", "Mackerel", "Megrim", "Monkfish", "Norway Pout",
"Plaice", "Poor Cod", "Sole", "Sprat", "Whiting")
#Creates an alphabetically arranged array called 'species_list', which is list of the predator species we are focusing on in this project
renamed_df <- renamed_df[renamed_df$pred_species %in% species_list, ]
#Removes any observations of predator species not in the species_list, i.e. they are irrelevant data points for this project
This data set is formed from recordings taken by multiple ships between the years 1886-2016. We chose to focus on 16 predator species, as these are the same species analysed in a Mizer model which values from this project will inform. It is important to note that the Mizer model we are looking at also includes the predator species `Boarfish’, however, there are a very limited number of observations for this species in the provided data set, so any analysis of Boarfish cannot be relied upon (and therefore we do not consider observations of the predator species Boarfish).
Fish were taken from some location, and their individual stomach contents recorded. Prey of the same predator species and (roughly) the same size found in a single predator’s stomach were recorded a single observation. We call this number of prey that make up a single observation for some predator stomach ‘no._prey_per_stmch’.
Using similar logic,‘total_prey_weight’ is the total biomass of prey from a single observation, and ‘indiv_prey_weight’ is the approximate weight of one prey individual found in the stomach, defined using:
\[ \text{indiv_prey_weight} = \frac{\text{total_prey_weight}}{\text{no._prey_per_stmch}} \]
There are some other column names which are useful to note:
\[ \text{PPMR} = \frac{\text{predator mass}}{\text{prey mass}} \] (where the prey mass is the ‘indiv_prey_weight’, representing the prey mass of a single prey in the stomach of a single observation)
ggplot(renamed_df, aes(x=prey_type, fill=prey_type)) +
geom_bar_pattern(stat = "count",
pattern_color = "white",
pattern_fill = "white",
aes(pattern = prey_type, pattern_angle = prey_type)) +
facet_wrap(~renamed_df$pred_species, scale="free_y") +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="none") +
labs(title = "Distribution of prey types for each predator species", x="Prey type",
y="Number of observations") +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
#geom_bar_pattern() creates a bar chart from the specified data, where each category has a different pattern used to fill it
#Using different patterns as well as different colours to fill the bars ensures the data can be understood if looked at in grey scale, and makes the document accessible for colour blind people
#facet_wrap() means an individual bar chart is created for each individual predator species
#'theme()' adjusts the labels along the x-axis so they lie slightly angled to the vertical direction so they are easy to read
#legend.position="none" means that there is no key to explain what each bar shows, as the x-axis is very clear in describing each bar
These are individual bar charts showing the distribution of the type of prey each predator eats.
The prey types are:
We can see that some predators eat a range of prey types (e.g. Whiting), while others prefer to eat a single prey type in abundance (e.g. European Hake and Monkfish mostly consume fish as their prey).
ggplot(renamed_df, aes(x=lprey_weight, colour=pred_species)) +
geom_density(aes(weight = no._prey_per_stmch)) +
labs(title = "Density of log(prey mass) observations", x="log(prey mass)",
y="Number density of observations") +
facet_wrap(~pred_species, scale="free_y") +
theme(legend.position="none") +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
#colour=pred_species means that each plot (as separated over individual species) is differently coloured
#theme(legend.position="none") means there is no 'key' printed next to the plots to explain what each plot represents
This plot shows that different predator species consume differently sized distributions of prey. For example, some species have a ‘peak’ at roughly a single size of prey (such as Horse Mackerel and Norway Pout), and some species show a wider range of prey masses which they consume, such as:
Therefore, we can see that each predator species consumes prey in a different way and in different distributions.
renamed_df$haul_id_short <- gsub("\\-.*", "", renamed_df$haul_id)
renamed_df$haul_id_short <- gsub("_", "", renamed_df$haul_id_short)
#the haul_id values are renamed to be shortened versions of the ship names (e.g. CLYDE) rather than the complete id (e.g. CLYDE-1935-6)
ggplot(data = renamed_df, aes(lprey_weight, no._prey_per_stmch)) +
labs(title = "Number of prey per predator stomach v. log(prey mass) ",
x="log(prey mass)", y="Number of prey per predator stomach") +
geom_point(size=0.5) +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
#geom_point() adds individual points which show individual recorded points
#size=0.5 defines the size of each point on this graph
This graph is looking at the distribution of the mass of prey recorded, i.e. looking at what is the most common prey mass over all prey species. It is a graph over all the data points, so includes recordings from all the ships involved.
There are some potentially interesting results, so results from different ships were plotted on individual curves to further look into these results.
ggplot (data = renamed_df, aes(x=lprey_weight, y=no._prey_per_stmch, color=haul_id_short)) +
labs(title = "Number of prey per observation v. log(prey mass) - separated by all ship names",
x="log(prey mass)", y="Number of prey per observation") +
geom_point(size=0.02) +
theme(strip.text = element_text(size = 7), legend.position="none",
plot.title = element_text(size=22), axis.title = element_text(size=22)) +
facet_wrap(~renamed_df$haul_id_short, scale="free_y") +
theme(plot.title = element_text(size=25), axis.title = element_text(size=20))
Taking a closer look at some of these ships gives the plots below:
interesting_haul <- filter(renamed_df,
haul_id_short=='CLYDE'|haul_id_short=='END04'|haul_id_short=='LUC'|
haul_id_short=='HIDDINK'|haul_id_short=='EXCmacDATSTO815'|
haul_id_short=="Excmacdatsto815error")
#interesting_haul is a new data frame which only contains values recorded by ships which gave "interesting" looking datapoints
haul_list_interesting <- unique(interesting_haul$haul_id_short)
#creates an array containing every individual (renamed) ship name that we are interested in
ggplot (data = interesting_haul, aes(x=lprey_weight, y=no._prey_per_stmch,
color=haul_id_short)) +
labs(title = "Number of prey per observation v. log(prey mass)
- separated by ship names, for a selection of 6 ships",
x="log(prey mass)", y="Number of prey per observation") +
geom_point(size=0.02) +
theme(strip.text = element_text(size = 10), legend.position="none") +
facet_wrap(~interesting_haul$haul_id_short, scale="free_y") +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
These six graphs show a range of unusual looking points:
There are a number of other ships which also provided potentially erroneous data, and the total number of datapoints affected are as below:
type1 <- length((filter(renamed_df, haul_id_short=='END04'))$haul_id_short)
#counting the number of observations recorded for each sort of error or interesting relation
type2 <- length((filter(renamed_df, haul_id_short=='CLION09' | haul_id_short=='LUC' | haul_id_short=='UNITY' | haul_id_short=='CLYDE'))$haul_id_short)
type4 <- length((filter(renamed_df, haul_id_short=='EXCmacDATSTO815'|haul_id_short=='Excmacdatsto815error'))$haul_id_short)
ship_erroneous <- data.frame("Graph_Type"=c("y proportional to exp(-x)", "Single prey mass"
, "Same data"),
"Number_of_observations"=c(type1, type2, type4))
#creates a data frame which shows the number of observations of each error 'type'
kable(ship_erroneous)
| Graph_Type | Number_of_observations |
|---|---|
| y proportional to exp(-x) | 2093 |
| Single prey mass | 554 |
| Same data | 5618 |
#creates a table of the ship_erroneous data frame as an output
In total, 8265 out of the total 267431 observations lie within these potentially erroneous groups. This is: \[ \frac{8265}{267431} \times 100 = 3.090517 \% \]
This means that approximately \(3.1\%\) of the data set comes from potentially unreliable sources. Though we will not remove or alter the data set to account for these potentially erroneous data points in this project, for further analysis it may be sensible to remove the data points these six ships recorded so that any results are as reliable as possible.
We are attempting to find a link between the predator mass and the prey mass, and \(\log()\) of each axis is used to see the proportionality of the axes. The mathematics behind this is as follows:
\[ \log (\text{prey mas}) = m \times \text{log(predator mass)} + c , \]
where m is the gradient of the slope and c is the y-intercept (thinking of this graph as a linear model).
Taking the exponential of both sides,
\[ \text{prey mass} = \exp({m{\log(\text{predator mass})}}) + \log(D) , \] where \(\log(D) = c\).
Finally,
\[ \text{prey mass} = \text{predator mass}^m \times D \]
Hence, we want \(m = \pm 1\) to show that predator mass is linearly proportional to prey mass (as expected). However, to simply show that the two variables are dependent on each other, we are looking for instances where \(m \neq 0\).
#message=FALSE means some output messages aren't printed in the knitted HTML document
ggplot(data = renamed_df, aes(lpred_weight, lprey_weight)) +
labs(title = "log(prey mass) v. log(predator mass) plot",
x="log(predator mass)", y="log(prey mass)") +
geom_hex(bins = 20) +
scale_fill_viridis(direction=-1, option="C") +
stat_smooth(method='lm', se=FALSE, colour="blue") +
theme_classic() +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
# theme_classic() removes the grey background of the plot to make it easier to read and understand
# stat_smooth adds a line of best fit plotted through the points
# method='lm' ensures it is a straight line, i.e. a linear model
# se=FAlSE creates only a single line, with no error of margin included
#geom_hex creates "bins" in the data where the density is calculated over
#bins=20 means that there are 20 "bins" in the horizontal direction and 20 in the vertical
#scale_fill_viridis creates a density colour scale, where the least dense area is yellow and the most dense is purple
slope <- lm(lprey_weight ~ lpred_weight, data = renamed_df)
cat("Slope of the log(prey mass) v. log(predator mass) line of best fit:",
summary(slope)$coefficients[2],
"\n",
"Standard error of this line of best fit:", summary(slope)$coefficients[4])
## Slope of the log(prey mass) v. log(predator mass) line of best fit: 0.6712536
## Standard error of this line of best fit: 0.001915732
#'cat()' prints some output message as purely the characters included in the '()' section
# summary(slope)$coefficients creates an array containing the coefficients of the linear model of the relationship between two axes, including the slope of the LOBF and the standard error of this line
# The first item of the array gives out the y-intercept of the line, and the second item of the array is the gradient
Also, the gradient of the slope is equal to \(0.6712536 \pm 0.001915732\). This is not equal to \(0\), and therefore this plot supports the assumption that the predator and prey masses are dependent on each other in some way.
However, for this graph the most dense area (the area with the greatest number of points in a single hexagonal ‘bin’) is purple, and the least dense area is coloured in yellow. Therefore, we can see that there are lots of observations around the point where \(\log(\text{predator mass}) = 6\) and \(\log(\text{prey mass}) = 0\), and near where \(\log(\text{predator mass}) = -1\) and \(\log(\text{prey mass}) = -5\). Therefore, the line of best fit (in blue) isn’t plotted directly through these most dense areas, and thus we can’t claim that this line of best fit is very reliable.
This graph is plotted over the entire data set and includes values from all of the available predator species. Instead, we will separate the plots by predator species to see if there is some proportionality between the predator and prey mass for each specific species.
Here, we are splitting up the graphs over each individual predator species to look and see if there is a proportionality between the predator and prey masses for each specific predator species.
ggplot(data = renamed_df, aes(lpred_weight, lprey_weight, colour=pred_species)) +
labs(title = "log(prey mass) v. log(predator mass), separated by predator species",
x="log(predator mass)", y="log(prey mass)") +
geom_point(size=0.01) +
facet_wrap(~pred_species, scale="free_y") +
geom_smooth(method='lm', colour="black") +
theme(legend.position="none") +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13),
strip.text = element_text(size = 10))
#the grey bands around the lines of best fit represent the standard error of this line
i <- 1
species_grad = c()
uncert=c()
#setting i=1 for the while loop
#and creating an empty vector called 'species_grad' to hold data
while(i<=length(species_list)){
species_df <- renamed_df %>% filter(pred_species == fixed(species_list[i]))
grad <- coef(lm((species_df$lprey_weight)~(species_df$lpred_weight)))
species_grad[i] <- grad[2]
preyvpred.lm <- lm(lprey_weight ~ lpred_weight, data = species_df)
uncert[i] <- summary(preyvpred.lm)$coefficients[4]
i=i+1
}
#'while' means that this function repeats through the length of 'species_list' until all predator species are accounted for
#i=i+1 increases the value of i each time through this loop to motivate this repeat
#this creates a data frame (species_df) containing each individual predator species, then calculates the gradient of the log(pred mass) v. log(prey mass) graph for this individual predator species, then inserts this gradient value to the ith value of an array named 'species_grad'
#this also creates a linear model of log(prey mass) against log(predator mass) for each individual predator species, and extracts the standard error of the line of best fit for this linear model, storing this in an array called 'uncert'
pvp_grads <- data.frame("Predator_Species"=c(species_list),
Gradient=c(species_grad),
Error=c(uncert))
kable(pvp_grads)
| Predator_Species | Gradient | Error |
|---|---|---|
| Blue Whiting | -0.2433567 | 0.0912603 |
| Cod | 0.7269038 | 0.0019058 |
| Common Dab | 0.6359695 | 0.0519601 |
| European Hake | 0.7677213 | 0.0271726 |
| Haddock | 0.7004115 | 0.0069827 |
| Herring | 0.1507461 | 0.0228279 |
| Horse Mackerel | 0.5960227 | 0.0450872 |
| Mackerel | 0.2379693 | 0.0245757 |
| Megrim | 1.1524540 | 0.0593195 |
| Monkfish | 0.8168852 | 0.0269028 |
| Norway Pout | 0.9604312 | 0.0371339 |
| Plaice | 0.7014262 | 0.0155969 |
| Poor Cod | 1.0845397 | 0.1052513 |
| Sole | 0.3803803 | 0.1200316 |
| Sprat | 0.5874927 | 0.1286451 |
| Whiting | 0.6945304 | 0.0028509 |
#creates a new data frame name 'pvp_grads', with one column named 'pred_species' and the other named 'gradient'
#each row contains the name of some individual predator species and its associated gradient
#kable prints the information in the pvp_grads
We are wanting gradient \(\neq 0\) to suggest dependence between the predator and prey mass, and specifically gradient \(= \pm 1\) to suggest that they are linearly proportional.
The gradient \(\neq 0 \pm\) (their standard error) for all of the predator species. Therefore, the assumption that the predator and prey masses are dependent on each other is a reasonable one and it holds for every predator species.
Having proportionality between the predator and prey masses shows us that using the PPMR (predator prey mass ratio) is a reasonable decision when using this data set.
Here, we are looking for the most common log(PPMR) value for each individual species. This will be found by plotting the density of the log(PPMR) observations, and weighting these observations based on two different variables. By assuming that these are normally distributed relations, there should be a ‘peak’ point of log(PPMR) which is where the most commonly seen log(PPMR) value lies. We assume this will be unique for each predator species, i.e. that each predator has a preferred PPMR value, and hence each predator species has a preferred prey size relative to the mass of the predator that it most frequently consumes.
These plots are weighted by the prey biomass. This means that we are looking the biomass density of \(\log\)(PPMR), so observations are counted up according to their biomass. This means that a single observation which includes prey of a large biomass will represent a large density, and a single observation of prey of a small biomass will be shown as having a small density,
ggplot(data = renamed_df, aes(x=lppmr, colour=pred_species)) +
labs(title = "Density plot of log(PPMR), weighted by biomass density of prey",
x="log(PPMR)", y="Biomass density of prey") +
facet_wrap(~renamed_df$pred_species, scale="free_y") +
geom_density(aes(weight = total_prey_weight)) +
theme(legend.position="none") +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13),
strip.text = element_text(size = 10))
#using facet_wrap for the variable (pred_species) means the data is separated into individual plots for each predator species
#geom_density means area under the curve = 1 (i.e. the graph is normalised)
#weight=total_prey_weight means the points are 'weighted' by the column weight (biomass) of each individual prey
These plots are weighted by the number pf prey per observation. This means that data points with a larger number of prey per observation are prioritised/weighted more than those with a smaller number of prey per observation.
ggplot(data = renamed_df, aes(x=lppmr, colour=pred_species)) +
labs(title = "Density plot of log(PPMR), weighted by number density of prey",
x="log(PPMR)", y="Number density of prey") +
facet_wrap(~renamed_df$pred_species, scale="free_y") +
geom_density(aes(weight = no._prey_per_stmch)) +
theme(legend.position="none", plot.title = element_text(size=15),
axis.title = element_text(size=13), strip.text = element_text(size = 10))
#weight=no._prey_per_stmch means the points are 'weighted' by the number of prey per observation
By looking at the two differently weighted graphs, we can see that the mean of the plots are ‘shifted’ by some amount (e.g. the Blue Whiting when weighted by prey biomass has a mean log(PPMR) of roughly 4.8, and when weighted by number of prey this mean becomes roughly 7.5).
Here, we are only looking at observations relating of the predator type Herring.
herringDF <- renamed_df %>%
filter(pred_species == fixed("Herring"))
#creates a separate data set (called 'herringDF') only containing observations for the predator species 'Herring'
The curve with points weighted by the prey biomass is plotted in blue, and a normal curve (also weighted by prey biomasss) is plotted over the top as a dashed black line.
bio_herringmean = weighted.mean(herringDF$lppmr, w = herringDF$total_prey_weight, na.rm = TRUE)
bio_herringSD = sqrt(wtd.var(herringDF$lppmr, w = herringDF$total_prey_weight, na.rm = TRUE))
#weighted.mean gives the arithmetic mean of log(PPMR), where datapoints are weighted by the prey weight of observations
#similarly, wtd.rvar is the variance of log(PPMR), where the datapoints are also weighted by the prey weight
#standard deviation is defined as the square root of the variance
#na.rm=TRUE means any rows with missing values (values that equal 'na') aren't included in the mean/variance calculations, but are instead ignored
ggplot(data = herringDF, aes(x=lppmr)) +
labs(title = "Density plot of log(PPMR) for Herring,
weighted by prey biomass",
x="log(PPMR)",y="Biomass density of observations") +
geom_density(aes(weight = total_prey_weight), colour="red") +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13)) +
stat_function(fun = dnorm,
linetype="dashed",
args= with(herringDF, c(mean = bio_herringmean, sd = bio_herringSD))) +
xlim(-5,17)
#axis.text.y adjusts the size of the axis labels on the y-axis so they are most easily readable
#stat_function adds a normal distribution curve (fun=dnorm) with a mean and standard deviation equal to what was calculated earlier for this data set
#linetype makes the line for the normal distribution into a dotted (black) line
#xlim sets limits for the x-axis so that all the necessary data points can be seen
cat("Mean value of log(PPMR), weighted by biomass of prey:", bio_herringmean,
"\n",
"Standard deviation of this:", bio_herringSD)
## Mean value of log(PPMR), weighted by biomass of prey: 6.781942
## Standard deviation of this: 2.370106
#"\n| creates a line break so that the outputs appear on different lines in the knitted html document
The curve with points weighted by the number of prey per observation is plotted in red, and a normal curve (also weighted by the number of prey per observation) is plotted over the top as a dotted black line.
no_herringmean = weighted.mean(herringDF$lppmr, w = herringDF$no._prey_per_stmch, na.rm = TRUE)
no_herringSD = sqrt(wtd.var(herringDF$lppmr, w = herringDF$no._prey_per_stmch, na.rm = TRUE))
#the mean and variance are both weighted by the number of prey
ggplot(data = herringDF, aes(x=lppmr), group=1) +
labs(title = "Density plot of log(PPMR) for Herring,
weighted by number of prey per observation",
x="log(PPMR)", y="Number density of observations") +
geom_density(aes(weight = no._prey_per_stmch), colour="green") +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13)) +
stat_function(fun = dnorm,
linetype = "dotted",
args= with(herringDF, c(mean = no_herringmean, sd = no_herringSD))) +
xlim(0,25)
#the xlim is different to the graph above because these data points lie over a slightly different log(PPMR) range
cat("Mean value of log(PPMR), weighted by the number of prey per observation:", no_herringmean,
"\n",
"Standard deviation of this:", no_herringSD)
## Mean value of log(PPMR), weighted by the number of prey per observation: 13.68691
## Standard deviation of this: 2.473435
ggplot(data = herringDF, aes(x=lppmr), group=1) +
labs(title = "Density plot of log(PPMR) for Herring,
with various weightings",
x="log(PPMR)", y="Number/biomass density of observations",
color="Line", linetype="Line") +
geom_density(aes(weight = no._prey_per_stmch, colour="prey biomass",
linetype="prey biomass")) +
geom_density(aes(weight = total_prey_weight, colour="number of prey per observation",
linetype="number of prey per observation")) +
stat_function(fun = dnorm,
args= with(herringDF, c(mean = no_herringmean, sd = no_herringSD)),
aes(colour="number of prey per observation - normal curve",
linetype="number of prey per observation - normal curve")) +
stat_function(fun = dnorm,
args= with(herringDF, c(mean = bio_herringmean, sd = bio_herringSD)),
aes(colour="prey biomass - normal curve",
linetype="prey biomass - normal curve")) +
theme(plot.title = element_text(size=20), axis.title = element_text(size=13),
legend.title = element_text(size=20),
legend.text = element_text(size=12),
legend.position=c(1,1),
legend.justification=c(1,1)) +
scale_color_manual(values=c('green', 'black', 'red', 'black')) +
scale_linetype_manual(values = c("solid", "dashed", "solid", "dotted")) +
xlim(-5,30)
#scale_color_manual manually adds a key to the graph to describe what the differently coloured lines represent
#scale_linetype_manual manually adds a key to the graph to describe what the differently shaped lines represent (e.g. some lines are dotted, dashed or solid)
#each curve is given a name using 'aes(colour="")', and these are then given a specific colour using 'values='
This is a graph with both the the distribution of Herring log(PPMR) as weighted by prey biomass and number of prey plotted over each other (along with appropriately weighted normal distribution curves plotted over the top of each).
and
The mathematics of shifted means for these different weightings is:
\[ (\text{predicted weighted mean})_{\text{prey biomass}} = (\text{weighted mean})_{\text{no. of prey }} - (\text{standard deviation}_{\text{no. of prey }})^2 \]
Hence, the expected result is:
\[ (\text{predicted weighted mean})_{\text{prey biomass}} = 13.68691 - (2.473435)^2 = 7.569029 \]
The \((\text{predicted weighted mean})_{\text{prey biomass}}\) lies within the standard error (\(\pm 2.370106\)) of the actual prey biomass weighted mean as given by the plot above. Therefore, the shifting mean equations are relatively accurate for this data set with these weightings.
Because we began with an assumption that \(\log\)(PPMR) is a normally distributed variable, the accuracy in weighted mean calculation under this assumption then supports the idea that the data can be plotted according to normal distributions. Therefore, we can claim that the \(\log\)(PPMR) is a normally distributed variable, allowing us to compare the behaviour of \(\log\)(PPMR) to a normal distribution in later analysis.
ggplot(data=renamed_df, aes(lpred_weight, lppmr, colour=pred_species)) +
geom_point(size=0.001) +
labs(title = "log(PPMR) v. log(predator mass) plot",
x="log(Predator mass)", y="log(PPMR)") +
stat_smooth (method='lm', colour="black") +
facet_wrap(~pred_species, scale="free_y") +
theme(legend.position="none", plot.title = element_text(size=15),
axis.title = element_text(size=13))
Graphing \(\log\)(predator mass) v. \(\log\)(PPMR) for individual predator species to see if the predator mass related to the PPMR.
\[ \log(\text{PPMR}) = m \times \log (\text{predator mass}) + c \]
where m is the gradient (assuming a linear model between the variables) and c is the y-intercept. Then,
\[ \text{PPMR} = (\text{predator mass})^m + D \]
(where \(c = \log(D)\))
To prove that \(\log\)(predator mass) is related to the \(\log\)(PPMR), we want the two to be proportional in some way and hence we are looking for slope, \(m = \pm 1\) for a linear relationship. This would then mean
\[ \log(\text{PPMR}) \propto m * \log(\text{predator mass}) \]
And therefore
\[ \text{PPMR} \propto (\text{predator mass})^m \]
since \(\log\) is simply a function applied to the axes. Therefore, a gradient of \(\pm 1\) represents a linear relationship, and a gradient \(\neq 0\) means there is some dependency between the PPMR and predator mass (as expected).
j <- 1
species_grad_ppmr = c()
uncert_ppmr = c()
while(j<=length(species_list)){
species_df <- renamed_df %>% filter(pred_species == fixed(species_list[j]))
grad <- coef(lm((species_df$lppmr)~(species_df$lpred_weight)))
species_grad_ppmr[j] <- grad[2]
ppmrvpred.lm <- lm(lppmr ~ lpred_weight, data = species_df)
uncert_ppmr[j] <- summary(ppmrvpred.lm)$coefficients[4]
j=j+1
}
ppmrvp_grads <- data.frame("Predator_species"=c(species_list),
Gradient=c(species_grad_ppmr),
"Standard_Error" = c(uncert_ppmr))
kable(ppmrvp_grads)
| Predator_species | Gradient | Standard_Error |
|---|---|---|
| Blue Whiting | 1.2433567 | 0.0912603 |
| Cod | 0.2730962 | 0.0019058 |
| Common Dab | 0.3640305 | 0.0519601 |
| European Hake | 0.2322787 | 0.0271726 |
| Haddock | 0.2995885 | 0.0069827 |
| Herring | 0.8492539 | 0.0228279 |
| Horse Mackerel | 0.4039773 | 0.0450872 |
| Mackerel | 0.7620307 | 0.0245757 |
| Megrim | -0.1524540 | 0.0593195 |
| Monkfish | 0.1831148 | 0.0269028 |
| Norway Pout | 0.0395688 | 0.0371339 |
| Plaice | 0.2985738 | 0.0155969 |
| Poor Cod | -0.0845397 | 0.1052513 |
| Sole | 0.6196197 | 0.1200316 |
| Sprat | 0.4125073 | 0.1286451 |
| Whiting | 0.3054696 | 0.0028509 |
13 out of the possible 16 predator species have a gradient \(\neq 0\), and therefore this data is very supportive of there being some level of dependence between the PPMR and predator mass.
However, not every predator species has a gradient \(\neq 0\) (\(\pm\) (their standard error), the species Megrim, Norway Pout, Poor Cod have a gradient equal to \(0\) (to 1 significant figure)). We are also ideally looking for a gradient of \(\pm 1\) to show a linear relationship. Therefore, we will continue to investigate this relationship with different plots.
Herring_df <- renamed_df %>% filter(pred_species == fixed("Herring"))
#creating a data frame only containing observations where the predator species is herring
ggplot(data=Herring_df, aes(lpred_weight, lppmr)) +
geom_point(size=0.5) +
labs(title = "log(PPMR) v. log(predator mass) plot: Herring",
x="log(Predator mass)", y="log(PPMR)", color="Weightings", linetype="Weightings") +
stat_smooth(aes(weight=no._prey_per_stmch, color='by number of prey in stomach',
linetype='by number of prey in stomach'),
method='lm', se=FALSE) +
stat_smooth(aes(weight=total_prey_weight, color='by prey biomass', linetype='by prey biomass'),
method='lm', se=FALSE) +
stat_smooth(aes(color='no weighting', linetype='no weighting'), method='lm', se=FALSE) +
scale_color_manual(values=c("green", "red", "blue")) +
scale_linetype_manual(values = c("dashed", "dotted", "solid")) +
theme_classic() +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
# se=FALSE doesn't add an area of error around the LOBF
#weighted by number of prey per observation
perstmch_weighting <- lm(lppmr ~ lpred_weight, data = Herring_df,
weight=Herring_df$no._prey_per_stmch)
#weighted by prey biomass
biomass_weighting <- lm(lppmr ~ lpred_weight, data = Herring_df,
weight=Herring_df$total_prey_weight)
#no weighting
no_weighting <- lm(lppmr ~ lpred_weight, data = Herring_df)
cat("Weighted by the number of prey per observation:",
"\n",
"Slope:", summary(perstmch_weighting)$coefficients[2],
"\n",
"Standard error:", summary(perstmch_weighting)$coefficients[4],
"\n")
## Weighted by the number of prey per observation:
## Slope: 1.310481
## Standard error: 0.04267642
cat("Weighted by the prey biomass:",
"\n",
"Slope:", summary(biomass_weighting)$coefficients[2],
"\n",
"Standard error:", summary(biomass_weighting)$coefficients[4],
"\n")
## Weighted by the prey biomass:
## Slope: 1.350805
## Standard error: 0.06601733
cat("With no weightings in the calculations:",
"\n",
"Slope:", summary(no_weighting)$coefficients[2],
"\n",
"Standard error:", summary(no_weighting)$coefficients[4],
"\n")
## With no weightings in the calculations:
## Slope: 0.8492539
## Standard error: 0.02282792
Looking at just the predator species ‘Herring’. There are three LOBF (line of best fit), weighted by different variables, each with individual values for their gradient.
For observations of the predator species Herring, the gradient is equal to \(1\) (to 1 significant to figure and \(\pm\) (their standard error)) when the line of best fit is weighted by either the prey biomass or the number of prey in the stomach. This is compelling evidence for some dependency between the PPMR and the predator mass (and it appears that this is linear).
We are now proceeding with the idea that the \(\log(\text{predator mass})\) and \(\log(\text{PPMR})\) are dependent on one another in some way, and hence the predator mass is dependent on the PPMR.
Residuals with a constant variance will show a density that is approximately normally distributed. This also means that the residuals all lie close to the line of best fit and therefore the data is strongly linearly related.
`Heteroscedasticity’ is when the residuals (error terms) are unequally scattered throughout a plot. Therefore, if residuals do not show a normal distribution (and instead show heteroscedasticity), then it is unclear whether or not the method of a linear relationship is appropriate to fit the data set.
For the arbitrarily selected predator species Blue Whiting, we get the following plots which all discuss the idea of normally distributed residuals.
BlueW_df <- renamed_df %>% filter(pred_species == fixed("Blue Whiting"))
BlueW_model <- lm(lppmr ~ lpred_weight, data=BlueW_df)
res <- resid(BlueW_model)
predicted <- predict(BlueW_model)
#lm() fits a regression models for log(PPMR) against log(predator mass)
#resid() creates a list of the residual of each individual data point
#predict() gives the predicted value of a data point by using the previous behaviour and patterns of the data to determine possible future values
ggplot(BlueW_df, aes(x = lpred_weight, y = lppmr)) +
geom_smooth(method = "lm", se = FALSE, colour="black") +
geom_segment(aes(xend = lpred_weight, yend = predicted), colour="red", size=0.1) +
geom_point(size=0.5) +
labs(title='The log(predator mass) against log(PPMR) graph,
with lines from the individual points to the line of best fit: Blue Whiting',
x='log(predator mass)', y='log(PPMR)') +
theme(plot.title = element_text(size=13), axis.title = element_text(size=13))
#showing the residuals of each point to the line of best fit
#geom_smooth() creates the regression line of the regression model from lm()
#geom_segment() draws a line from each point to the regression line
This plot shows how distributed the points are in comparison to the line of best fit. For a ‘well fitting’ line of best fit, we would expect the residuals to be approximately normally distributed. As discussed earlier, the residual of a point is equal to: \[ \text{residual} = \text{actual value} - \text{predicted/fitted value} , \] where the predicted/fitted value is equal to the value as predicted by the line of best fit.
It is difficult to understand how the residuals are distributed from this plot, so instead we will consider plots of the residuals against the fitted values, of the density of the residuals, and a QQPlot.
To further investigate this linearity, we plot the residuals (as calculated from the linear model) against the fitted values of \(\log\)(PPMR) as predicted by the linear model being fitted to the data.
ggplot(BlueW_model, aes(x = .fitted, y = .resid)) +
geom_point(colour="red", size=0.5) +
geom_hline(yintercept = 0, colour="grey") +
labs(title='Plot of the residuals against the fitted values: Blue Whiting',
x='Fitted values of log(PPMR)', y='Residuals') +
geom_smooth() +
theme_classic() +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
#this plots the residuals of each data point, and adds a line across at res=0 to help visualise the data
#.resid creates a list of the residuals of each individual data point
This plot is used to detect unequal residual variances and any anomalous points. A linear model will show a roughly equal distribution of residuals across the fitted values, meaning that the residuals are equally far from the line of best fit across all fitted values.
Plotting this relationship for the species Blue Whiting shows that the residuals are approximately evenly distributed across all the fitted values of \(\log\)(PPMR). This suggests that the residuals are normally distributed, which is compelling evidence for a linear relationship between \(\log\)(PPMR) and \(\log\)(predator mass).
A QQplot (quantile-quantile plot) shows if two data sets come from the same distribution. For this data set, we plot the actual residuals against the expected residuals if the residuals were normally distributed. The points on this graph should fall on the straight line (at a 45 degree angle), as this is what we would expect for normally distributed residuals.
qqnorm(res, xlab = "Theoretical Residuals", ylab = "Sample Residuals",
main="Normal Q-Q plot of residuals - for the species Blue Whiting")
qqline(res, col="red")
#plot a Q-Q plot to determine if the residuals follow a normal distribution
#for a normal distribution, we would expect the data points to fall along roughly a straight line at a 45 degree angle
We can see that these points fall along the 45 degree-angled line fairly strongly, and therefore we have fairly strong evidence for normally distributed residuals. However, there is some variation from this 45 degree line for especially small and large residuals, which suggests that these residuals are not perfectly normally distributed.
By plotting the density of the residuals, we see how frequently different residuals occur in this model. Normally distributed residuals suggest there is a linear relationship between the axes because the residuals are mostly clustered around \(= 0\) and therefore the data points lie close to the line of best fit. However, unequally scattered residuals (known as `heteroscedasticity’) suggest that there is little or no relation between the two axes (here, the \(\log\)(PPMR) and the \(\log\)(predator mass)) and that therefore a linear relationship is not appropriate to fit the data set to.
ggplot(BlueW_model, aes(x = .resid)) +
geom_density(colour="red") +
labs(title='Density plot of the residuals: Blue Whiting', x='Residuals', y='Density') +
theme_classic() +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
#plots the density of the residuals of the lppmr v lpred mass graph for Herring
For observations from the predator species of Blue Whiting, we can see that the density of residuals looks approximately normally distributed, with a `peak’ near the centre and densities which trail off towards \(0\) either side of this peak. We can therefore claim that this normal distribution suggests a linear relation between the \(\log\)(PPMR) and the \(\log\)(predator mass), and therefore that the PPMR and predator mass are dependent on each other in some manner.
However, this plot} appears to show two clear peaks of density, located where the residual \(= -1.5\) and \(1\). Two separated peaks of density is not what is expected from a normal distribution curve. This is potentially due to some source of variance in the data which is creating these two density peaks instead of one. To further investigate this variance, I chose to plot mixed effects models (as discussed later in Sections 9).
This is a histogram plot of the residuals of the plot, separated for each predator species. As discussed earlier, a normal distribution of the density of residuals would suggest that the data fits a linear model. This would then infer that the axes would be related to one another.
model<- lm(lppmr~lpred_weight, data=renamed_df)
ggplot(model, aes(x = .resid, color=renamed_df$pred_species)) +
geom_density() +
facet_wrap(~renamed_df$pred_species, scales = "free_y") +
labs(title="Density plot of the residuals", x='Residuals', y='Density') +
theme(legend.position="none", plot.title = element_text(size=15),
axis.title = element_text(size=13))
#Plotting the density of the residuals for each predator species
From these plots, we can see that most of the species show roughly a normal distribution of residuals, with a strong centrally located `peak’ and densities which tail slowly down to \(0\) at either side of this peak. This then suggests that the data set can be linearly modelled.
However, similar to Blue Whiting, many of these predator species show multiple peaks in their densities. Therefore, we can suggest that there is some amount of variance which affects many predator species in the data set and causes this multiple-peak behaviour.
To continue this analysis, we use a different model (a mixed effects model) to fit this data set to in order to more accurately approximate certain parameters in the variable relationships (and to account for some variance caused by certain factors in the data set).
We will now fit the data to a mixed effects model, which allows us to account for some variance that external effects may cause to the data.
A simple linear model is mathematically represented by: \[ \log(\text{PPMR})_i = \beta_1 \times \log(\text{predator mass})_i + \beta_0 \]
Where:
renamed_df %>%
ggplot(aes(x=lpred_weight, y=lppmr, group=pred_species, color=pred_species)) +
geom_point(size=0.1) +
labs(x='log(predator mass)', y='log(PPMR)',
title='A plot of the observations of log(PPMR)
against the log(predator mass)') +
theme_classic() +
theme(plot.title = element_text(size=13), axis.title = element_text(size=13))
#theme_classic() removes the grey background of the plot to make this plot easier to read
This plot is very difficult to interpret, and the relationship between the \(\log(\text{PPMR})\) and \(\log(\text{predator mass})\) is difficult to visualise as there is not a single gradient for each predator species. To aid in our interpretation, We will add random effects to account for some of the variance in the \(\log(\text{PPMR})\) values.
Here, there is the same slope for every predator species, but each species group has a different y-intercept (i.e. the line for each predator species group crosses the \(\log\)(predator mass)=0 line at a different value for \(\log\)(PPMR)). This information is shown in the table and the plot below.
Fixed effects: \(\log\)(predator mass)
Random effects: year (fixed slopes and random intercepts)
This is mathematically represented by:
\[ \log(\text{PPMR})_{iy} = (\beta_{0} + \alpha_{0y}) + \beta_{1} \log(\text{predator mass})_{i} + \epsilon_{iy} \]
Where:
one <- lmer(lppmr ~ lpred_weight + (1|year), data = renamed_df, REML=FALSE)
#same slope for every predator species, but varying intercept
year_list <- c(sort(unique(renamed_df$year)))
year_list_2000 <- year_list[c(year_list>2000)]
#a vector only including the values for year which are greater than 2000
one_df <- data.frame(Year=c(year_list),
"Slope"=c(coef(one)$year[2]))
one_df <- one_df %>% transmute(Year, Slope=lpred_weight)
one_df <- one_df[one_df$Year %in% year_list_2000, ]
rownames(one_df) <- NULL
#This removes the numbers associated with each row to make the output neater
kable(one_df)
| Year | Slope |
|---|---|
| 2001 | 0.1218916 |
| 2002 | 0.1218916 |
| 2004 | 0.1218916 |
| 2005 | 0.1218916 |
| 2006 | 0.1218916 |
| 2007 | 0.1218916 |
| 2008 | 0.1218916 |
| 2009 | 0.1218916 |
| 2010 | 0.1218916 |
| 2011 | 0.1218916 |
| 2012 | 0.1218916 |
| 2013 | 0.1218916 |
| 2015 | 0.1218916 |
| 2016 | 0.1218916 |
#This outputs the intercept term for each predator species for the fitted model, and only for values where the year is after 2000
This table shows a sample of the estimated slope values for the years between 2001-2016. We can see that each year has the same value of slope.
plot_model(one, type = "re", vline.color="grey", dot.size=1.1, value.size=3,
show.values=T, value.offset=.3) +
labs(x='Year', y='Value of random effect',
title='Random effects: Intercepts - Model 1') +
xlim(factor(year_list_2000))
#For a mixed effects model, using type="re" then plots the estimated values of the random effects
This is a plot showing the estimated mean values of the random effects, and how they are approximately distributed.
So, we can see that the y-intercepts of this mixed effects model are randomly distributed round some central value, and that this y-intercept varies across the years.
Fixed effects: \(\log\)(predator mass)
Random effects: year (random slope and fixed intercept)
This is mathematically represented by:
\[ \log(\text{PPMR})_{ij} = \beta_{0} + (\beta_{1} + \alpha_{1y}) \log(\text{predator mass})_{i} + \epsilon_{iy} \]
Where:
two <- lmer(lppmr ~ lpred_weight + (0 + lpred_weight|year), data = renamed_df, REML=FALSE)
#(0+ | ) means that the slope of the graph is different for each predator species, but they each group has the same intercept
For this plot, the value of log(predator mass) is modelled to be dependent on the year that each observation is taken. This allows us to have lines for each unique year which all have the same intercept but differing slopes.
two_df <- data.frame(Year=c(year_list),
"Intercept"=c(coef(two)$year[1]))
two_df <- two_df %>% transmute(Year, Intercept=X.Intercept.)
two_df <- two_df[two_df$Year %in% year_list_2000, ]
rownames(two_df) <- NULL
kable(two_df)
| Year | Intercept |
|---|---|
| 2001 | 5.553805 |
| 2002 | 5.553805 |
| 2004 | 5.553805 |
| 2005 | 5.553805 |
| 2006 | 5.553805 |
| 2007 | 5.553805 |
| 2008 | 5.553805 |
| 2009 | 5.553805 |
| 2010 | 5.553805 |
| 2011 | 5.553805 |
| 2012 | 5.553805 |
| 2013 | 5.553805 |
| 2015 | 5.553805 |
| 2016 | 5.553805 |
We can see from this table that every group for year has the same value for its y-intercept.
plot_model(two, type = "re", vline.color="grey", dot.size=1.1, value.size=3,
show.values=T, value.offset=.3) +
labs(x='Year', y='Value of random effect', title='Random effects: Slope - Model 2') +
xlim(factor(year_list_2000))
However, the slope is randomly distributed for each year group, and is unique for each group of year.
Fixed effects: \(\log\)(predator mass)
Random effects: year (random slope and random intercept)
This is mathematically represented by: \[ \log(\text{PPMR})_{ij} = (\beta_{0} + \alpha_{0j}) + (\beta_{1} + \alpha_{1j}) \log(\text{predator mass})_{i} + \epsilon_{ij} \]
Where:
three <- lmer(lppmr ~ lpred_weight + (1+lpred_weight|year), data = renamed_df, REML=FALSE)
#Random intercept and random slope (correlated)
plot_model(three, type = "re", vline.color="grey", dot.size=1.1, value.size=3,
show.values=T, value.offset=.3) +
labs(x='Year', y='Value of random effect', title='Random effects - Model 3') +
xlim(factor(year_list_2000))
From the table, we can see that this model features slopes and intercepts that are unique for each individual year.
anova(one,two,three)
## Data: renamed_df
## Models:
## one: lppmr ~ lpred_weight + (1 | year)
## two: lppmr ~ lpred_weight + (0 + lpred_weight | year)
## three: lppmr ~ lpred_weight + (1 + lpred_weight | year)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## one 4 1148144 1148186 -574068 1148136
## two 4 1157243 1157285 -578618 1157235 0 0
## three 6 1140374 1140437 -570181 1140362 16873 2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#comparing the three models
We can see that the third trial has a significantly smaller BIC value, and hence we will choose to continue using models which have slopes and intercepts for their random effects which vary according to the fixed effect.
Here, all the random effects are modeled with a randomly distributed slope and intercept for each level of a grouping.
In this model,
mackerel_df <- renamed_df %>% filter(pred_species == fixed("Mackerel"))
a <- lmer(lppmr ~ lpred_weight + (1+lpred_weight|haul_id_short), data = mackerel_df, REML=FALSE)
summary(a)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: lppmr ~ lpred_weight + (1 + lpred_weight | haul_id_short)
## Data: mackerel_df
##
## AIC BIC logLik deviance df.resid
## 129180.0 129228.8 -64584.0 129168.0 25067
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.3883 -0.8476 -0.0531 0.8054 3.2284
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## haul_id_short (Intercept) 5.67578 2.3824
## lpred_weight 0.06209 0.2492 -0.03
## Residual 10.07684 3.1744
## Number of obs: 25073, groups: haul_id_short, 17
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 5.6113 0.7486 7.496
## lpred_weight 0.5082 0.1036 4.907
##
## Correlation of Fixed Effects:
## (Intr)
## lpred_weght -0.482
## optimizer (nloptwrap) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.00396204 (tol = 0.002, component 1)
This is mathematically represented by:
\[ \log(\text{PPMR})_{ih} = (\beta_{0} + H_{0h}) + (\beta_{1} + H_{1h}) \log(\text{predator mass})_{i} + \epsilon_{ih} \]
Where:
qqnorm(resid(a), xlab = "Theoretical Residuals", ylab = "Sample Residuals",
main="Normal Q-Q plot of residuals - mixed effect model with ship name")
qqline(resid(a), col="blue")
We have plotted a qqplot which shows the actual residuals (calculated from the data set) plotted against the expected residuals if they were completely normally distributed. If the residuals are perfectly normally distributed, then the points should fall on the straight line which is at a 45 degree angle.
We can see that the points fall roughly on the straight line but not completely, and therefore the data set does not very accurately fit to the model.
mackerel_df %>%
ggplot(aes(x = resid(a))) +
geom_density(colour="blue") +
labs(title='Density plot of the residuals for the model with ship name
as a random effect: Mackerel', x='Residuals', y='Density') +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
#resid(a) gives the residuals of model a
#The residuals are found by subtracting the fitted from the predicted values
We can also plot the density of the residuals of this model. Normally distributed residuals suggests that the residuals have a constant variance and hence the axes are related in some way.
This model has:
b <- lmer(lppmr ~ lpred_weight + (1+lpred_weight|haul_id_short) + (1+lpred_weight|year),
data = mackerel_df, REML=FALSE)
summary(b)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: lppmr ~ lpred_weight + (1 + lpred_weight | haul_id_short) + (1 +
## lpred_weight | year)
## Data: mackerel_df
##
## AIC BIC logLik deviance df.resid
## 129178.9 129252.1 -64580.5 129160.9 25064
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.3848 -0.8478 -0.0527 0.8037 3.2287
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## haul_id_short (Intercept) 2.85041 1.6883
## lpred_weight 0.03410 0.1847 1.00
## year (Intercept) 2.29299 1.5143
## lpred_weight 0.04607 0.2146 -1.00
## Residual 10.07302 3.1738
## Number of obs: 25073, groups: haul_id_short, 17; year, 15
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 5.3859 0.7748 6.952
## lpred_weight 0.5352 0.1157 4.626
##
## Correlation of Fixed Effects:
## (Intr)
## lpred_weght -0.529
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
Mathematical representation \[ \log(\text{PPMR})_{ihy} = (\beta_{0} + H_{0h} + Y_{0y}) + (\beta_{1} + H_{1h} + Y_{1y}) \log(\text{predator mass})_{i} + \epsilon_{ihyr} \]
Where:
qqnorm(resid(b), xlab = "Theoretical Residuals", ylab = "Sample Residuals",
main="Normal Q-Q plot of residuals - with ship name and year")
qqline(resid(b), col="red")
mackerel_df %>%
ggplot(aes(x = resid(b))) +
geom_density(colour="red") +
labs(title='Density plot of the residuals for the model with ship name and year
as random effects', x='Residuals', y='Density') +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
We see that the points lie closer to the expected 45 degree line in the QQPlot and the residuals appear slightly more clustered around a central point. This is all suggestive of this model fitting in a more preferable way compared to only using the ship name as a random effect.
This is a model with the variables:
c <- lmer(lppmr ~ lpred_weight + (1+lpred_weight|haul_id_short) + (1+lpred_weight|year) +
(1+lpred_weight|ices_rectangle), data = mackerel_df, REML=FALSE)
summary(c)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: lppmr ~ lpred_weight + (1 + lpred_weight | haul_id_short) + (1 +
## lpred_weight | year) + (1 + lpred_weight | ices_rectangle)
## Data: mackerel_df
##
## AIC BIC logLik deviance df.resid
## 126546.3 126643.9 -63261.2 126522.3 25061
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7025 -0.7479 -0.0034 0.7178 3.1882
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ices_rectangle (Intercept) 21.036719 4.58658
## lpred_weight 0.618101 0.78619 -0.97
## haul_id_short (Intercept) 10.066727 3.17281
## lpred_weight 0.004668 0.06832 -1.00
## year (Intercept) 3.845649 1.96103
## lpred_weight 0.075033 0.27392 -1.00
## Residual 8.790453 2.96487
## Number of obs: 25073, groups: ices_rectangle, 364; haul_id_short, 17; year, 15
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 4.8912 1.2227 4.000
## lpred_weight 0.5958 0.1493 3.992
##
## Correlation of Fixed Effects:
## (Intr)
## lpred_weght -0.815
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
Mathematical representation: \[ \log(\text{PPMR})_{ihyr} = (\beta_{0} + H_{0h} + Y_{0y} + R_{0r} )+ (\beta_{1} + H_{1h} + Y_{1y} + R_{1r}) \log(\text{predator mass})_{i} + \epsilon_{ihyr} \]
Where:
qqnorm(resid(c), xlab = "Theoretical Residuals", ylab = "Sample Residuals",
main="Normal Q-Q plot of residuals - with ship name, year, and location")
qqline(resid(c), col="green")
For this third model which uses all three random effects, the observations lie fairly strongly on this 45 degree line. There is some movement away from the line, especially at very small and very large values of the theoretical residuals. However, this is the most compelling QQplot we have seen, and therefore suggests that using all three random effects gives the most normally distributed set of residuals.
mackerel_df %>%
ggplot(aes(x = resid(c))) +
geom_density(colour="green") +
labs(title='Density plot of the residuals for the model with ship name, year,
and location as random effects', x='Residuals', y='Density') +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
We can also see that the density of residuals graphs looks very similar to a normally distributed graph. Therefore, using the three random effects of ship name, year, and location produces the most well-fitting model for this data set.
ggplot(data=mackerel_df) +
labs(title='Density plot of the residuals: Mackerel', x='Residuals', y='Density',
color="Model", linetype="Model") +
geom_density(aes(x=resid(a), colour="with ship name", linetype="with ship name")) +
geom_density(aes(x=resid(b), colour="with ship name and year",
linetype="with ship name and year")) +
geom_density(aes(x=resid(c), colour="with ship name, year and location",
linetype="with ship name, year and location")) +
scale_color_manual(values=c("blue", "red", "green")) +
scale_linetype_manual(values = c("dashed", "dotted", "dotdash")) +
theme_classic()
anova(a,b,c)
## Data: mackerel_df
## Models:
## a: lppmr ~ lpred_weight + (1 + lpred_weight | haul_id_short)
## b: lppmr ~ lpred_weight + (1 + lpred_weight | haul_id_short) + (1 + lpred_weight | year)
## c: lppmr ~ lpred_weight + (1 + lpred_weight | haul_id_short) + (1 + lpred_weight | year) + (1 + lpred_weight | ices_rectangle)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## a 6 129180 129229 -64584 129168
## b 9 129179 129252 -64580 129161 7.1001 3 0.06878 .
## c 12 126546 126644 -63261 126522 2638.5863 3 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#comparing the three models
The BIC value is the smallest when all three random effects are considered, and the residuals for this three-random-effect model look the most similar to a normal distribution. Therefore, we will choose to use a model which uses all three random effects for further analysis.
The third model (with ship name, year and location as random effects) appears to represent the data the most effectively. We now want to consider a mixed effects model over all the predator species (not just Mackerel). Therefore, we choose to continue with model c (with random effects:ship name, year, ices_rectangle), and a fixed effect of predator species to allow us to make individual analysis for each unique predator species.
To note, the coefficients of each random effect are distributed according to: \(\sigma_{H_{[0h]}}\) \(sim \mathcal{N} (0, \sigma^2_{H_{[0h]}})\). However, each group \(H\) only has a single standard distribution across every subgroup \(h\).
Using the variables below,
The complete output for the final model is:
final <- lmer(lppmr ~ lpred_weight + pred_species +
(1+lpred_weight|haul_id_short) + (1+lpred_weight|year) +
(1+lpred_weight|ices_rectangle),
data = renamed_df, REML=FALSE)
summary(final)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula:
## lppmr ~ lpred_weight + pred_species + (1 + lpred_weight | haul_id_short) +
## (1 + lpred_weight | year) + (1 + lpred_weight | ices_rectangle)
## Data: renamed_df
##
## AIC BIC logLik deviance df.resid
## 1050849.1 1051131.1 -525397.5 1050795.1 254568
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.7053 -0.6066 -0.0367 0.5777 6.7409
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ices_rectangle (Intercept) 0.79813 0.8934
## lpred_weight 0.02387 0.1545 -0.78
## year (Intercept) 0.79693 0.8927
## lpred_weight 0.01425 0.1194 -0.79
## haul_id_short (Intercept) 2.63966 1.6247
## lpred_weight 0.06293 0.2509 -0.44
## Residual 3.58887 1.8944
## Number of obs: 254595, groups:
## ices_rectangle, 718; year, 97; haul_id_short, 97
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 5.0363484 0.2580857 19.514
## lpred_weight 0.2737651 0.0377947 7.243
## pred_speciesCod -0.2123354 0.1259732 -1.686
## pred_speciesCommon Dab -0.3095996 0.1419434 -2.181
## pred_speciesEuropean Hake -1.4461576 0.1425841 -10.142
## pred_speciesHaddock 0.5266592 0.1270956 4.144
## pred_speciesHerring 1.0340250 0.1207008 8.567
## pred_speciesHorse Mackerel 1.1556948 0.2618773 4.413
## pred_speciesMackerel 1.0736578 0.1290907 8.317
## pred_speciesMegrim -0.7387652 0.1448988 -5.098
## pred_speciesMonkfish -1.9261290 0.1622835 -11.869
## pred_speciesNorway Pout 0.8562367 0.1738728 4.924
## pred_speciesPlaice 0.7833358 0.1361487 5.754
## pred_speciesPoor Cod -0.0006363 0.1552063 -0.004
## pred_speciesSole 0.6118634 0.2000691 3.058
## pred_speciesSprat 0.9023672 0.1657724 5.443
## pred_speciesWhiting -0.4030091 0.1256431 -3.208
## optimizer (nloptwrap) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.0264106 (tol = 0.002, component 1)
Mathematical representation: \[ \log(\text{PPMR})_{ihyrp} = (\beta_{0} + H_{0h} + Y_{0y} + R_{0r}) + (\beta_{1} + H_{1h} + Y_{1y} + R_{1r}) \log(\text{predator mass})_i + \\ \sum_{p=1}^{16} \alpha_p (\text{predator species})_p + \epsilon_{ihyr} \] Where:
and,
\[ \text{species} = \{\text{Blue Whiting}, \text{ Cod}, \text{ Common Dab}, \text{ European Hake}, \ldots, \text{ Whiting}\} \]
is an array containing all possible predator species types, so:
\[ \text{pred}_p = \begin{cases} 1, \text{if the species that observation i belongs to is } \text{species}[p] \\ 0, \text{ otherwise} \end{cases} \]
This model has the following values of coefficients:
| Variable name | Meaning | Value |
|---|---|---|
| \(\beta_{0}\) | the intercept term | 5.0363484 |
| \(\beta_{1}\) | the fixed effect (slope) for \(\log(\text{predator mass})\) | 0.2737651 |
| \(\sigma_{H[0h]}\) | s.d. for the intercept term \(H_{0h}\), dependent on the haul_id_short | 1.6247 |
| \(\sigma_{H[1h]}\) | s.d. for the slope term \(H_{1h}\), dependent on the haul_id_short | 0.2509 |
| \(\sigma_{Y[0y]}\) | s.d. for the intercept term \(Y_{0y}\), dependent on the year | 1.0074 |
| \(\sigma_{Y[1y]}\) | s.d. for the slope term \(Y_{1y}\), dependent on the year | 0.1194 |
| \(\sigma_{R[0r]}\) | s.d. for the intercept term \(R_{0r}\), dependent on the ices_rectangle | 0.8934 |
| \(\sigma_{R[1r]}\) | s.d. for the slope term \(R_{1r}\), dependent on the ices_rectangle | 0.1545 |
| \(\sigma_{\epsilon[ihyr]}\) | the s.d. of the residual/error term | 1.8944 |
and the fixed effects coefficients are:
| Predator Species | Estimated gradient (\(\alpha_p\)) |
|---|---|
| Cod | -0.2123354446 |
| Common Dab | -0.3095996172 |
| European Hake | -1.4461576412 |
| Haddock | 0.5266592309 |
| Herring | 1.0340249983 |
| Horse Mackerel | 1.1556948335 |
| Mackerel | 1.0736578125 |
| Megrim | -0.7387651598 |
| Monkfish | -1.9261290331 |
| Norway Pout | 0.8562367326 |
| Plaice | 0.7833357976 |
| Poor Cod | -0.0006362586 |
| Sole | 0.6118634430 |
| Sprat | 0.9023672482 |
| Whiting | -0.4030090827 |
When the mixed effects model was applied, I found that there was no
estimated gradient value for the predator species
Blue Whiting. This may be due to some amount of overfitting
of the model, or because observations of Blue Whiting only occurred in a
limited number of year and ship name subgroups. For conciseness in this
project we will therefore ignore the lack of a gradient value for
Blue Whiting and continue with gradient values only for the
remaining 15 predator species
As seen in Section 8.3, a QQplot discusses whether or not the residuals of this model are normally distributed. In order to fit a mixed effects model, we must have an approximately linear relation between the two axes.
qqnorm(resid(final), xlab = "Theoretical Residuals", ylab = "Sample Residuals",
main="Normal Q-Q plot of residuals - final mixed effect model")
qqline(resid(final), col="orange")
We see that the points generally lie close to this 45 degree line, although some extremely small or large residuals are not completely aligned. Due to this fairly close alignment, we can therefore claim that this suggests a normal distribution of residuals.
renamed_df %>%
ggplot(aes(x = resid(final))) +
geom_density(aes(linetype="Data", color="Data")) +
labs(title='Density plot of the residuals', x='Residuals', y='Density',
color="Line", linetype="Line") +
stat_function(fun = dnorm, aes(linetype = "Normal", color="Normal"),
args= with(final, c(mean = mean(resid(final)), sd(resid(final))))) +
scale_color_manual(values=c('orange', 'black')) +
scale_linetype_manual(values = c("solid", "dashed")) +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13),
legend.title = element_text(size=15),
legend.text = element_text(size=12),
legend.position=c(1,1),
legend.justification=c(1,1),
)
Here, we have added a normal curve (in dotted black) to the plot using the mean and standard deviation as calculated from the model. Comparing this normal curve to the density of residuals, we see that the residuals are essentially normally distributed, but with a greater density of points for which the residuals \(= 0\). Therefore, this is strong evidence that the residuals are normally distributed, and hence that there is some form of linear relationship between the \(\log\)(PPMR) and \(\log\)(predator mass). This then suggests that we can use these mixed effects models (as they are an advancement of alinear model), and therefore that our final mathematical model is appropriate for this data set.
Separating the density of residuals plot out over the predator species gives:
renamed_df %>%
ggplot(aes(x = resid(final), color=pred_species)) +
geom_density() +
labs(title='Density plot of the residuals - separated over species',
x='Residuals', y='Density') +
facet_wrap(~pred_species) +
theme(legend.position="none", plot.title = element_text(size=15),
axis.title = element_text(size=13))
For a predator which shows a very flat-looking density of residuals (such as Horse Mackerel in Figure), this means that the residuals are very scattered and display heteroscedastic behaviour. This also suggests that the observed points of the fitted \(\log\)(PPMR) against \(\log\)(predator mass) plot are very spread out and do not show a clear linear relation. Hence, we find that there are observations with a range of \(\log\)(PPMR) values for each \(\log\)(predator mass). Transforming away from the \(\log\) of the axes, we can therefore claim that predators with a flat-looking density of residuals have many values of PPMR for a each predator mass.
This then suggests that these predators consume based on a wide range of PPMRs, and thus that they consume prey of a wide distribution of masses.
We now look at specific plots for Mackerel which allow us to make some conclusions about the feeding preferences of the species Mackerel.
mackerel_df <- renamed_df %>% filter(pred_species == fixed("Mackerel"))
ggplot(mackerel_df, aes(x=prey_type, fill=prey_type)) +
geom_bar_pattern(stat = "count",
pattern_color = "white",
pattern_fill = "black",
aes(pattern = prey_type, pattern_angle = prey_type)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title = element_text(size=13),
legend.position="none") +
labs(x="Prey type", y="Number of Observations")
Mackerel mostly eat Zooplankton and prey belonging to the group `other’.
ggplot(mackerel_df, aes(x=lppmr)) +
geom_density(aes(weight = no._prey_per_stmch), color="lightseagreen") +
labs(x="log(PPMR)", y="Number density of observations") +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13)) +
theme_classic()
Mackerel have a wide distribution of \(\log\)(PPMR) values, suggesting that there is a wide range of prey masses consumed by predator.
mackerel_df %>%
ggplot(aes(x = resid(c))) +
geom_density(colour="lightseagreen") +
labs(x='Residuals', y='Density') +
theme_classic() +
theme(plot.title = element_text(size=15), axis.title = element_text(size=13))
The residuals are fairly widely distributed. This means that the observed points of the fitted \(\log\)(PPMR) against \(\log\)(predator mass) plot are very spread out, and therefore there is a wide distribution of \(\log\)(PPMR) values across the species Mackerel. This then suggests that a predator such as Mackerel feeds on a wide distribution of prey masses relative to the mass of the predator.